Prepare data to ML¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.io as pio
pio.renderers.default='notebook'
import plotly.express as px
# Main data
url_src = "data/LBNL_file.csv"
df = pd.read_csv(url_src, low_memory=False, parse_dates=True)
# cols = ['state','city','zip_code','customer_segment','self_installed','system_size_DC','installation_date']
# df = df[cols]
df = df.replace(-9999,np.nan)
r = {"COM": "NON-RES","NON-PROFIT": "NON-RES","SCHOOL": "NON-RES",'GOV': 'NON-RES'}
df['customer_segment'] = df['customer_segment'].replace(r, regex=True)
# Remove too low price data to minimice errors
df = df.loc[df.total_installed_price > 100]
df['city'] = df['city'].str.upper()
df = df.loc[df['system_size_DC'] != 0]
# Only California
df = df.loc[df['state'] == 'CA']
df['installation_date'] = pd.to_datetime(df['installation_date'])
df = df.sort_values(by='installation_date')
# Create years
df['year'] = df['installation_date'].dt.year
# Remove early years (less data)
df = df.loc[df['year'] >= 2015]
pd.set_option("display.max_columns", None)
# Remove data that have additional modules or batteries to get comparable data
df = df.loc[df.efficiency_module_2.isna()]
df = df.loc[df.efficiency_module_3.isna()]
df = df[df.total_installed_price.notna()]
df = df.loc[df.battery_rated_capacity_kWh.isna()]
df = df.loc[df.battery_rated_capacity_kW.isna()]
df = df[df.columns.drop(list(df.filter(regex='_2')))]
df = df[df.columns.drop(list(df.filter(regex='_3')))]
df = df.loc[df.installer_name != 'Tesla Energy']
df = df.drop(['battery_rated_capacity_kWh', 'battery_rated_capacity_kW','installer_name',
              'system_ID_1', 'third_party_owned', 'state', 'new_construction','zip_code'], axis=1)
df = df.loc[df['rebate_or_grant'] == 0]
df = df.drop(['rebate_or_grant', 'tilt_1', 'azimuth_1', 'ground_mounted'], axis=1)
# Create RES residential column (other data is non residential)
df.loc[df['customer_segment'] == 'RES','RES'] = 1
df['RES'] = df['RES'].replace(np.nan,0)
# Still some additional modules remove
df = df.loc[df.solar_storage_hybrid_inverter_1 == 0]
df = df.loc[df.additional_modules == 0]
df = df.loc[df.bifacial_module_1 == 0]
df = df.loc[df.technology_module_1 != '-9999']
df = df.drop(['additional_modules', 'bifacial_module_1'], axis=1)
df = df.loc[df.additional_inverters == 0]
# Replace data
df = df.replace(np.nan,0)
df = df.replace('-9999','Unknown')
df = df.loc[df.inverter_model_1 != 'Unknown']
df = df.loc[df.BIPV_module_1 == 0]
df = df.drop(['BIPV_module_1'], axis=1)
# Create multiplier to get stars
df['x'] = round(df['total_installed_price']/df['system_size_DC']*df['efficiency_module_1'],2)
df = df.loc[df.x > 1]
# Remove self installed to get more coparable data
df = df.loc[df.self_installed == 0]
df = df.loc[df.expansion_system == 0]
df = df.loc[df.multiple_phase_system == 0]
# Drop useless cols
df = df.drop(['self_installed', 'expansion_system', 'multiple_phase_system', 'customer_segment',
              'utility_service_territory','data_provider_1', 'additional_inverters',
             'city', 'installation_date', 'tracking','solar_storage_hybrid_inverter_1'], axis=1)
df = df.reset_index()
df = df.drop(['index'], axis=1)
# Create stars by cut function
xmin = df.x.min()
x1 = round(df.x.mean()*0.6666,2)
x2 = round(df.x.mean()*1.3333,2)
xmax = df.x.max()
xbin = [xmin,x1,x2,xmax]
df['stars'] = pd.cut(df.x, 
                   bins=xbin,
                   labels=[1,2,3]
                  )
df.stars = df.stars.astype(float)
# -----DF2 (model)-----
df2 = df.copy()
# Categors
df2['module_manufacturer_1'] = df2['module_manufacturer_1'].astype('category').cat.codes
df2['module_model_1'] = df2['module_model_1'].astype('category').cat.codes
df2['technology_module_1'] = df2['technology_module_1'].astype('category').cat.codes
df2['inverter_manufacturer_1'] = df2['inverter_manufacturer_1'].astype('category').cat.codes
df2['inverter_model_1'] = df2['inverter_model_1'].astype('category').cat.codes
print('Shape:',df.shape)
print('Shape:',df2.shape)
print('Cut bin limits:',xbin)
Shape: (201674, 20)
Shape: (201674, 20)
Cut bin limits: [2.47, 490.72, 981.52, 13659.29]

Plotly scatter for investicate data¶

In [2]:
fig = px.scatter_3d(df, x='total_installed_price', y='system_size_DC', z='stars',
                  color='efficiency_module_1', animation_frame='year', opacity=0.5, 
                    hover_name='module_manufacturer_1',
                    hover_data=['system_size_DC', 'total_installed_price',
                               'module_model_1', 'module_quantity_1', 'technology_module_1',
                               'nameplate_capacity_module_1', 'efficiency_module_1',
                               'inverter_manufacturer_1', 'inverter_model_1', 'inverter_quantity_1',
                               'micro_inverter_1', 'built_in_meter_inverter_1',
                               'output_capacity_inverter_1', 'DC_optimizer', 'inverter_loading_ratio', 'RES', 'x'],
                width=1000, height=1000,range_color=[0,0.25],
                range_x=[0,df['total_installed_price'].max()], range_y=[0,df['system_size_DC'].max()], 
                range_z=[1,3],
               title='Data 2015-2019 all customers')
fig.show()

Notifications¶

  • Test showed that there where basicly 3 reasonable class.
  • Still wonder will I give some how points also other things than size and price and how.
  • Anyway this will be my sample ML data.
  • Basicly this model is look the best module for efficeny and $/kW ratio.
  • Still wondering will I drop price ratio because it's not neccessary tell quality.

adj_col_names = [] for name in df.columns: adj_col_names.append(name.replace('_1',''))

df.columns = adj_col_names df2.columns = adj_col_names df = df.loc[df['stars'].notna()] df2 = df2.loc[df2['stars'].notna()]

df.to_csv("data/rfc_sample.csv") df2.to_csv("data/rfc_model.csv")

For possible further development¶

  • Customer perspective
    • What I need or have afford?
    • How to find best available option?
  • Retailer perspective
    • How to find the best equipment in terms of price-quality ratio for sale?
  • Functions
    • Could I creat functions that do the job in certain parameters.
    • The parameters could include information on what aspects the customer would need